Exploratory Analysis

Preparing Workspace

Load necessary packages

library(caret) # Classification and Regression Training
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
library(corrplot)
## corrplot 0.84 loaded
library(dplyr) # A Grammar of Data Manipulations
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats) # Tools for Working with Categorical Variables (Factors)
## Warning: package 'forcats' was built under R version 3.5.2
library(gdata) # Various R Programming Tools for Data Manipulation
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(ggplot2) # Create Elegant Data Visualizations Using the Grammar of Graphics
library(ggrepel) # Automatically Position Non-Overlapping Text Labels with ‘ggplot2’
## Warning: package 'ggrepel' was built under R version 3.5.2
library(ggridges) # Ridgeline Plots in ‘ggplot2’
## 
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual
library(ggthemes) # Extra Themes, Scales, and Geoms for ‘ggplot2’
## Warning: package 'ggthemes' was built under R version 3.5.2
library(gmodels) # Various R Programming Tools for Model Fitting
library(grid) # The Grid Graphics Package
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:gdata':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(knitr) # A General-Purpose Package for Dynamic Report Generation in R
## Warning: package 'knitr' was built under R version 3.5.2
library(lmtest) # Testing Linear Regression Models
## Warning: package 'lmtest' was built under R version 3.5.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(plyr) # Tools for Splitting, Applying and Combining Data
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(purrr) # Functional Programming Tools
## Warning: package 'purrr' was built under R version 3.5.2
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following object is masked from 'package:gdata':
## 
##     keep
## The following object is masked from 'package:caret':
## 
##     lift
library(readr) # Read Rectangular Text Data
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(scales) # Scale Functions for Visualization
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following object is masked from 'package:purrr':
## 
##     discard
library(shiny) # Web Application Framework for R
## Warning: package 'shiny' was built under R version 3.5.2
library(skimr) # Compact and Flexible Summaries of Data
## Warning: package 'skimr' was built under R version 3.5.2
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
## 
##     kable
## The following object is masked from 'package:stats':
## 
##     filter
library(stringr) # Simple, Consistent Wrappers for Common String Operations
library(tibble) # Simple Data Frames
## Warning: package 'tibble' was built under R version 3.5.2
library(tidyr) # Easily Tidy Data with ‘spread()’ and ‘gather()’ Functions
## Warning: package 'tidyr' was built under R version 3.5.2
library(tidyverse) # Install and Load ‘Tidyverse’ Packages
library(usmap)
## Warning: package 'usmap' was built under R version 3.5.2
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
## The following object is masked from 'package:plyr':
## 
##     ozone
library(mapdata)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggmap':
## 
##     theme_nothing
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library("googleway")
library("ggspatial") 
library("rnaturalearth") 
library("rnaturalearthdata")
theme_set(theme_bw())
library("sf")
## Warning: package 'sf' was built under R version 3.5.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(gridExtra)

world <- ne_countries(scale = "medium", returnclass = "sf") #for mapping

Load cleaned data

clean_data <- readRDS("../../data/processed_data/processeddata.rds")
skim(clean_data)
## Skim summary statistics
##  n obs: 13165 
##  n variables: 23 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────
##               variable missing complete     n n_unique
##              bioregion       0    13165 13165        6
##              georegion       0    13165 13165        9
##             group_code       0    13165 13165        9
##  group_code_UCSC_other       0    13165 13165        2
##                 island       0    13165 13165        6
##     marine_season_code       0    13165 13165       57
##       marine_site_name       0    13165 13165       77
##            method_code       0    13165 13165        4
##   method_code_IP_other       0    13165 13165        2
##             mpa_region       0    13165 13165        6
##            season_name       0    13165 13165        4
##              site_code       0    13165 13165       77
##           species_code       0    13165 13165        3
##                  state       0    13165 13165        4
##                                              top_counts ordered
##   San: 5608, Oly: 4138, Gov: 2366, Sal: 652               FALSE
##   CA : 5382, CA : 2592, CA : 1660, OR: 1386               FALSE
##    UCS: 9657, UCL: 2032, PBN: 400, SSS: 366               FALSE
##                 UCS: 9657, Oth: 3508, NA: 0               FALSE
##    Mai: 12364, Bar: 279, Sad: 250, Hat: 150               FALSE
##      SU1: 440, SU1: 403, SU1: 392, FA1: 376               FALSE
##      Mil: 524, Sco: 513, Sti: 472, End: 468               FALSE
##      IP: 12012, BT2: 782, GSE: 336, TS3: 35               FALSE
##                 IP: 12012, Oth: 1153, NA: 0               FALSE
##  Cen: 5382, Sou: 2627, NUL: 1932, Nor: 1660               FALSE
##    Sum: 4552, Fal: 4489, Spr: 4112, Win: 12               FALSE
##      MCR: 524, SCT: 513, SWC: 472, END: 468               FALSE
##                                           P.o: 11416, K   FALSE
##       CA: 10548, OR: 1386, WA: 865, AK: 366               FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##              variable missing complete     n    mean      sd   p0  p25
##  marine_common_season       0    13165 13165  116.82   19.23   78  101
##    marine_common_year       0    13165 13165 2009.7     4.81 2000 2006
##     marine_sort_order       0    13165 13165 5566.24 1125.15 1720 5020
##       season_sequence       0    13165 13165    2.03    0.81    1    1
##              size_bin       0    13165 13165   82.44   42.2     5   50
##       size_sort_order       0    13165 13165    9.57    4.15    1    6
##                 total       0    13165 13165    9.92   16.43    1    2
##   p50  p75 p100     hist
##   118  131  152 ▅▅▆▇▇▇▇▅
##  2010 2013 2018 ▃▅▅▇▇▆▆▆
##  6090 6370 7650 ▁▁▁▂▃▃▇▁
##     2    3    4 ▇▁▇▁▁▇▁▁
##    80  110  320 ▅▇▇▃▁▁▁▁
##    10   12   33 ▅▇▇▃▁▁▁▁
##     4   11  360 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##   variable missing complete     n    mean   sd      p0     p25     p50
##   latitude       0    13165 13165   38.67 5.2    32.87   34.73   36.62
##  longitude       0    13165 13165 -122.25 2.86 -135.38 -123.98 -121.94
##      p75    p100     hist
##    41.65   57.05 ▇▆▃▂▁▁▁▁
##  -120.62 -117.25 ▁▁▁▁▅▇▆▃

Mapping Sample Sites

Create new data frame that just lists each site once

unique_sites <- subset(clean_data, !duplicated(clean_data$site_code)) 
#names(unique_sites)
sites_coordinates <- unique_sites[c(2,5:6,23)]
sites_coordinates_contig <- filter(sites_coordinates, sites_coordinates$state != "Alaska") 
#sites_coordinates
CA_coordinates <- filter(sites_coordinates, sites_coordinates$state == "CA")
OR_coordinates <- filter(sites_coordinates, sites_coordinates$state == "OR")
WA_coordinates <- filter(sites_coordinates, sites_coordinates$state == "WA")
AK_coordinates <- filter(sites_coordinates, sites_coordinates$state == "AK")

CA_coordinates <- as.data.frame(CA_coordinates)
OR_coordinates <- as.data.frame(OR_coordinates)
WA_coordinates <- as.data.frame(WA_coordinates)
AK_coordinates <- as.data.frame(AK_coordinates)

Plot sample sites

After messing around with several different mapping tools in R, I decided that trying to display the whole range (CA to AK) on one map would be really hard, and the Alaska sites are really just at one specific location, so I’m going to split Alaska off from the three states that are part of the contiguous US

Plot sites on state maps

Alaska:

world <- ne_countries(scale = "medium", returnclass = "sf")

AK_map <- ggplot(data = world) +
    geom_sf() +
  geom_point(data= sites_coordinates,aes(x=sites_coordinates$longitude, y=sites_coordinates$latitude,), size = 4, alpha = 0.4,  color ="orange") +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-168, -131), ylim = c(51, 73), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") +
  geom_text(aes(x = -151, y = 66, label = "ALASKA", size = 600)) +
  geom_text(aes(x = -149.9, y = 61.4, label = "*", size = 600)) +
  geom_text(aes(x = -154, y = 62.6, label = "Anchorage")) +
  scale_x_continuous(breaks = c(-160, -150, -140)) + 
  theme(legend.position = "none")

AK_map 
## Scale on map varies by more than 10%, scale bar may be inaccurate

ggsave(filename = "../../results/Exploratory_Analysis_results/AK_sites_map.png",plot = AK_map, width = 3.5, height = 4) 
## Scale on map varies by more than 10%, scale bar may be inaccurate
State <- sites_coordinates_contig$state

CA_map <- ggplot(data = world) +
  geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-126, -116), ylim = c(32, 42.4), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")+ 
  geom_text(aes(x = -122.7, y = 37.7, label = ">")) + 
  geom_text(aes(x = -123.5, y = 37.6, label = "San ")) +
  geom_text(aes(x = -124.2, y = 37.1, label = "Francisco")) +
  geom_text(aes(x = -120, y = 39, label = "CALIFORNIA")) + 
  geom_text(aes(x = -118.9, y = 33.7, label = ">")) + 
  geom_text(aes(x = -120.9, y = 33.6, label = "Los Angeles")) +
  theme(legend.position = "none") + 
  scale_x_continuous(breaks = c(-124, -122, -120, -118))

CA_map
## Scale on map varies by more than 10%, scale bar may be inaccurate

ggsave(filename = "../../results/Exploratory_Analysis_results/CA_sites_map.png",plot = CA_map, width = 3.5, height = 4) 
## Scale on map varies by more than 10%, scale bar may be inaccurate
OR_map <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "br", width_hint = 0.3) +
  coord_sf(xlim = c(-127.5, -120.8), ylim = c(41.5, 47.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") + 
  geom_text(aes(x = -122.4, y = 44, label = "OREGON")) +
  geom_text(aes(x = -122.8, y = 45.6, label = "*")) + 
  geom_text(aes(x = -122.8, y = 45.4, label = "Portland")) + 
  theme(legend.position = "none") +
  scale_x_continuous(breaks = c(-126, -124, -122))

OR_map
## Scale on map varies by more than 10%, scale bar may be inaccurate

ggsave(filename = "../../results/Exploratory_Analysis_results/OR_sites_map.png",plot = OR_map, width = 3.5, height = 4) 
## Scale on map varies by more than 10%, scale bar may be inaccurate
WA_map <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= sites_coordinates_contig,aes(x=sites_coordinates_contig$longitude, y=sites_coordinates_contig$latitude,col=State), size = 3, alpha = 0.6, width = 0.2) +
  annotation_scale(location = "br", width_hint = 0.3) +
  coord_sf(xlim = c(-125.2, -121.3), ylim = c(45.8, 49.1), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") + 
  geom_text(aes(x = -122.5, y = 46.7, label = "WASHINGTON")) +
  geom_text(aes(x = -123.7, y = 47.93, label = "Olympic")) +
  geom_text(aes(x = -123.7, y = 47.75, label = "Peninsula")) +
  geom_text(aes(x = -121.9, y = 47.75, label = "< Seattle")) +
  theme(legend.position = "none") + 
  scale_x_continuous(breaks = c(-124, -123, -122))

WA_map 

ggsave(filename = "../../results/Exploratory_Analysis_results/WA_sites_map.png",plot = WA_map, width = 3.5, height = 4) 

Count Data by Sampling Location

Overall histogram of counts per sample survey

clean_data %>% ggplot() + 
  geom_histogram(aes(total))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What does the structure of the data look like by state?

CA

CA <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "CA")
summary(CA)
##       marine_site_name marine_common_year    size_bin     
##  Mill Creek   : 524    Min.   :2000       Min.   :  5.00  
##  Scott Creek  : 513    1st Qu.:2005       1st Qu.: 50.00  
##  Stillwater   : 472    Median :2009       Median : 80.00  
##  Enderts      : 468    Mean   :2009       Mean   : 81.92  
##  Terrace Point: 447    3rd Qu.:2013       3rd Qu.:110.00  
##  Andrew Molera: 443    Max.   :2018       Max.   :230.00  
##  (Other)      :7681                                       
##      total         state     
##  Min.   :  1.000   AK:    0  
##  1st Qu.:  2.000   CA:10548  
##  Median :  5.000   OR:    0  
##  Mean   :  9.607   WA:    0  
##  3rd Qu.: 11.000             
##  Max.   :360.000             
## 

OR

OR <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "OR")
summary(OR)
##       marine_site_name marine_common_year    size_bin     
##  Fogarty Creek:351     Min.   :2000       Min.   :  5.00  
##  Bob Creek    :334     1st Qu.:2005       1st Qu.: 50.00  
##  Burnt Hill   :260     Median :2010       Median : 80.00  
##  Ecola        :228     Mean   :2010       Mean   : 87.21  
##  Cape Arago   :213     3rd Qu.:2014       3rd Qu.:120.00  
##  Alegria      :  0     Max.   :2018       Max.   :230.00  
##  (Other)      :  0                                        
##      total        state    
##  Min.   :  1.00   AK:   0  
##  1st Qu.:  2.00   CA:   0  
##  Median :  7.00   OR:1386  
##  Mean   : 15.28   WA:   0  
##  3rd Qu.: 18.00            
##  Max.   :212.00            
## 

WA

WA <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "WA")
summary(WA)
##              marine_site_name marine_common_year    size_bin     
##  Saddlebag North Cove:163     Min.   :2008       Min.   :  5.00  
##  Post Point          :151     1st Qu.:2011       1st Qu.: 50.00  
##  Hat Island West     :116     Median :2014       Median : 90.00  
##  Point Grenville     :111     Mean   :2014       Mean   : 89.26  
##  Kydikabbit Point    :102     3rd Qu.:2016       3rd Qu.:120.00  
##  Saddlebag South East: 87     Max.   :2018       Max.   :320.00  
##  (Other)             :135                                        
##      total         state   
##  Min.   :  1.000   AK:  0  
##  1st Qu.:  1.000   CA:  0  
##  Median :  2.000   OR:  0  
##  Mean   :  6.776   WA:865  
##  3rd Qu.:  5.000           
##  Max.   :133.000           
## 

AK

AK <- clean_data %>% select(marine_site_name, marine_common_year, size_bin, total, state) %>%
  filter(state == "AK")
summary(AK)
##       marine_site_name marine_common_year    size_bin     
##  Sage Rock    :148     Min.   :2011       Min.   :  5.00  
##  Pirates Cove :131     1st Qu.:2013       1st Qu.: 40.00  
##  Kayak Island : 87     Median :2015       Median : 60.00  
##  Alegria      :  0     Mean   :2015       Mean   : 63.51  
##  Andrew Molera:  0     3rd Qu.:2017       3rd Qu.: 87.50  
##  Arroyo Hondo :  0     Max.   :2018       Max.   :190.00  
##  (Other)      :  0                                        
##      total        state   
##  Min.   :  1.00   AK:366  
##  1st Qu.:  1.00   CA:  0  
##  Median :  3.00   OR:  0  
##  Mean   :  6.03   WA:  0  
##  3rd Qu.:  7.00           
##  Max.   :117.00           
## 

[INSERT TABLE SUMMARIZING THIS INFORMATION]


Count Data by Sampling Time

What does sea star abundance over time look like?

Yearly scale

#clean_data %>% ggplot(aes(marine_common_year, total)) +  geom_jitter(width = 0.15, alpha = 0.25)
clean_data %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(alpha = 0.4) + 
  xlab("Year") + 
  ylab("log(Abundance by Site)")  

#clean_data %>% ggplot() + geom_density(aes(total, fill = marine_common_year))

Lets color the points by state

clean_data %>% ggplot(aes(marine_common_year, total, col=state)) +  geom_jitter(width = 0.15, alpha = 0.25)

Woah, California is dominating this visualization.

Lets break that up into 4 parts:

clean_data %>%   
  ggplot(aes(marine_common_year, total, col=state)) +  geom_jitter(width = 0.15, alpha = 0.25) +
  facet_wrap(~state)

Washington and Alaska clearly weren’t surveyed for all years during which California and Oregon were.

What do these counts look like if we display them as log()?

Abundance_State_Year_graphs <- clean_data %>%   
  ggplot(aes(marine_common_year, log(total), col=state)) +  geom_jitter(width = 0.15, alpha = 0.25) +
  facet_wrap(~state) +
  xlab("Year") + ylab("log(Abundance)")
Abundance_State_Year_graphs

ggsave(filename = "../../results/Processing_results/Abundance_State_Year_graphs.png",plot = Abundance_State_Year_graphs, width = 5, height = 4) 

```


Abundance over time

clean_data %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(alpha = 0.4, fill = "grey67") +
  xlab("Year") + 
  ylab("log(Abundance)")  


So far I’ve been looking at abundance across all 3 species surveyed. What does the abundance over time graph look like for each species, seperately?

Summary statistics for each species over time

P.ochra_only <- filter(clean_data, species_code == "P.ochraceus")
K.tuni_only <- filter(clean_data, species_code == "K.tunicata")
E.tros_only <- filter(clean_data, species_code == "E.troschelii")
#skim(P.ochra_only)
#skim(K.tuni_only)
#skim(E.tros_only)

Species abundance over time

P.ochra_only %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(P.ochraceus Abundance)")  

K.tuni_only %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orange3", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(K.tunicata Abundance)")  

E.tros_only %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot(fill = "skyblue3", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(E.troschelii Abundance)")  

That last one looked like the log() function might not be applicable. Let’s see what it looks like without that:

E.tros_only %>% ggplot(aes(marine_common_year, total, group = marine_common_year)) + geom_boxplot(fill = "skyblue3", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(E.troschelii Abundance)") 


Characterizing sea star abundance in the last decade

The most recent and catastrophic sea star wasting disease outbreak hit the west coast of North America around 2013. How did sea star abundance look before the epidemic, how did it change during the epidemic, and have the populations rebounded?

last_decade <- filter(clean_data, marine_common_year >= "2010")
last_decade %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) +
  geom_boxplot(alpha = 0.4, fill = "grey67") +
  xlab("Year") + 
  ylab("log(Abundance)") 

It looks like populations leveled back out at 2016 and remained relatively unchanged in 2017 and 2018.


2013-2016 Abundance Data

It looks like theres a detectable decrease between 2013 and 2014, and that populations leveled back out at 2016 and remained relatively unchanged in 2017 and 2018. Lets narrow the time frame to only include sampling between 2013 and 2016.

last_decade$marine_common_year <- as.factor(last_decade$marine_common_year)
SSWD_timeframe <- filter(clean_data, marine_common_year >= "2013" & marine_common_year <= "2016")
SSWD_timeframe$marine_common_year <- as.factor(SSWD_timeframe$marine_common_year)
SSWD_timeframe %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot(alpha = 0.4, fill = "grey67") +
  xlab("Year") + 
  ylab("log(Abundance)")

By species

P.ochra_SSWD_tf <- filter(SSWD_timeframe, species_code == "P.ochraceus")
K.tuni_SSWD_tf <- filter(SSWD_timeframe, species_code == "K.tunicata")
E.tros_SSWD_tf <- filter(SSWD_timeframe, species_code == "E.troschelii")
#skim(P.ochra_SSWD_tf)
#skim(K.tuni_SSWD_tf)
#skim(E.tros_SSWD_tf)
P.ochra_SSWD_tf %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(P.ochraceus Abundance)")  

K.tuni_SSWD_tf %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orange3", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(K.tunicata Abundance)")  

E.tros_SSWD_tf %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + geom_boxplot(fill = "skyblue3", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(E.troschelii Abundance)")  

Breaking down years into sampling seasons

P.ochra_SSWD_tf_fig <- P.ochra_SSWD_tf %>% ggplot(aes(marine_common_season, log(total), group = marine_common_season)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("SeasonYear") + 
  ylab("log(P.ochra Abundance)") +
  scale_x_continuous(breaks = c("Sp13" = 129, "Sm13" = 130, "Fl13" = 131, "Sp14" = 133, "Sm14" = 134, "Fl14" = 135, "Sp15" = 137, "Sm15" = 138, "Fl15" = 139, "Sp16" = 141, "Sm16" = 142, "Fl16" = 143))
P.ochra_SSWD_tf_fig

# Used to code above x-axis
#numeric_seasons_code_key <- readRDS("../../data/processed_data/numeric_seasons_code_key.rds")
#numeric_seasons_code_key
K.tuni_SSWD_tf_fig <- K.tuni_SSWD_tf %>% ggplot(aes(marine_common_season, log(total), group = marine_common_season)) + 
  geom_boxplot(fill = "orange3", alpha = 0.4) +
  xlab("SeasonYear") + 
  ylab("log(K.tuni Abundance)") +
  scale_x_continuous(breaks = c("Sp13" = 129, "Sm13" = 130, "Fl13" = 131, "Sp14" = 133, "Sm14" = 134, "Fl14" = 135, "Sp15" = 137, "Sm15" = 138, "Fl15" = 139, "Sp16" = 141, "Sm16" = 142, "Fl16" = 143)) 
K.tuni_SSWD_tf_fig

E.tros_SSWD_tf_fig <- E.tros_SSWD_tf %>% ggplot(aes(marine_common_season, log(total), group = marine_common_season)) + 
  geom_boxplot(fill = "skyblue3", alpha = 0.4) +
  xlab("SeasonYear") + 
  ylab("log(E.tros Abundance)") +
  scale_x_continuous(breaks = c("Sp13" = 129, "Sm13" = 130, "Fl13" = 131, "Sp14" = 133, "Sm14" = 134, "Fl14" = 135, "Sp15" = 137, "Sm15" = 138, "Fl15" = 139, "Sp16" = 141, "Sm16" = 142, "Fl16" = 143)) 
E.tros_SSWD_tf_fig

grid.arrange(P.ochra_SSWD_tf_fig, K.tuni_SSWD_tf_fig, nrow=2)

By state

P.ochra_SSWD_tf_fig + facet_wrap(~ state)

P.ochra_SSWDtf_CA <- filter(P.ochra_SSWD_tf, state == "CA")

P.ochra_SSWDtf_CA_fig <- P.ochra_SSWDtf_CA %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(P.ochra Abundance)") 
P.ochra_SSWDtf_CA_fig <- P.ochra_SSWDtf_CA_fig + facet_wrap(~ marine_sort_order)
ggsave(filename = "../../results/Exploratory_Analysis_results/P.ochra_SSWDtf_CA_fig.png",plot = P.ochra_SSWDtf_CA_fig, width = 5, height = 4) 
P.ochra_SSWDtf_CA %>% ggplot2::ggplot() + geom_histogram(aes(P.ochra_SSWDtf_CA$total)) + facet_wrap(~ marine_common_year, 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

P.ochra_SSWDtf_highcount_temp <- P.ochra_SSWD_tf %>% filter(marine_common_year == "2013" & total >= 10)
unique(P.ochra_SSWDtf_highcount_temp$site_code)
##  [1] BML  BOA  BOB  BRN  CARP CRCO DAPT DMN  END  ESP  FKC  FOG  GREN HOP 
## [15] KIB  MCR  MEN  MOL  MUSH OCC  OLDS PIRA POP  SAGE SBSE SCT  SEA  SHCO
## [29] SWC  TPT  TRIS WHPT
## 77 Levels: ALEG ARG ARHO ARN BML BOA BOB BOD BRN CARP CAY CHM ... WIN
P.ochra_SSWDtf_highcount <- P.ochra_SSWD_tf %>% filter(site_code == "BML" | site_code == "BOA" | site_code == "BOB" | site_code == "BRN" | site_code == "CARP" | site_code == "CRCO" | site_code == "DAPT" | site_code == "DMN" | site_code == "END" | site_code == "ESP" | site_code == "FKC" | site_code == "FOG" | site_code == "GREN" | site_code == "HOP" | site_code == "KIB" | site_code == "MCR" | site_code == "MEN" | site_code == "MOL" | site_code == "MUSH" | site_code == "OCC" | site_code == "OLDS" | site_code == "PIRA" | site_code == "POP" | site_code == "SAGE" | site_code == "SBSE" | site_code == "SCT" | site_code == "SEA" | site_code == "SHCO" | site_code == "SWC" | site_code == "TPT" | site_code == "TRIS" | site_code == "WHPT")
head(P.ochra_SSWDtf_highcount)
##   group_code site_code marine_site_name marine_sort_order latitude
## 1       UCSC       BML           Bodega              5200  38.3182
## 2       UCSC       BML           Bodega              5200  38.3182
## 3       UCSC       BML           Bodega              5200  38.3182
## 4       UCSC       BML           Bodega              5200  38.3182
## 5       UCSC       BML           Bodega              5200  38.3182
## 6       UCSC       BML           Bodega              5200  38.3182
##   longitude marine_common_season marine_season_code marine_common_year
## 1 -123.0737                  130               SU13               2013
## 2 -123.0737                  130               SU13               2013
## 3 -123.0737                  130               SU13               2013
## 4 -123.0737                  130               SU13               2013
## 5 -123.0737                  130               SU13               2013
## 6 -123.0737                  130               SU13               2013
##   season_sequence season_name method_code species_code size_sort_order
## 1               2      Summer          IP  P.ochraceus               3
## 2               2      Summer          IP  P.ochraceus               4
## 3               2      Summer          IP  P.ochraceus               5
## 4               2      Summer          IP  P.ochraceus               6
## 5               2      Summer          IP  P.ochraceus               7
## 6               2      Summer          IP  P.ochraceus               8
##   size_bin total          mpa_region        georegion          bioregion
## 1       20     1 North Central Coast CA North Central OlyCstWA_2_SanFran
## 2       30     2 North Central Coast CA North Central OlyCstWA_2_SanFran
## 3       40     1 North Central Coast CA North Central OlyCstWA_2_SanFran
## 4       50     1 North Central Coast CA North Central OlyCstWA_2_SanFran
## 5       60     5 North Central Coast CA North Central OlyCstWA_2_SanFran
## 6       70    15 North Central Coast CA North Central OlyCstWA_2_SanFran
##     island group_code_UCSC_other method_code_IP_other state
## 1 Mainland                  UCSC                   IP    CA
## 2 Mainland                  UCSC                   IP    CA
## 3 Mainland                  UCSC                   IP    CA
## 4 Mainland                  UCSC                   IP    CA
## 5 Mainland                  UCSC                   IP    CA
## 6 Mainland                  UCSC                   IP    CA
P.ochra_SSWDtf_highcount %>% ggplot2::ggplot() + geom_histogram(aes(P.ochra_SSWDtf_highcount$total)) + facet_wrap(~ marine_common_year, 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

P.ochra_SSWDtf_highcount_fig <- P.ochra_SSWDtf_highcount %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(P.ochra Abundance)") 
P.ochra_SSWDtf_highcount_fig <- P.ochra_SSWDtf_highcount_fig + facet_wrap(~ marine_sort_order)
P.ochra_SSWDtf_highcount_fig

Over 25

P.ochra_SSWDtf_highercount_temp <- P.ochra_SSWD_tf %>% filter(marine_common_year == "2013" & total >= 25)
unique(P.ochra_SSWDtf_highercount_temp$site_code)
##  [1] BOB  BRN  CRCO DMN  END  FKC  FOG  GREN MCR  MOL  MUSH OCC  PIRA SCT 
## [15] SEA  SHCO SWC  TRIS
## 77 Levels: ALEG ARG ARHO ARN BML BOA BOB BOD BRN CARP CAY CHM ... WIN
P.ochra_SSWDtf_over25 <- P.ochra_SSWD_tf %>% filter(site_code == "BOB" | site_code == "BRN" | site_code == "CRCO" | site_code == "DMN" | site_code == "END" | site_code == "FKC" | site_code == "FOG" | site_code == "GREN" | site_code == "MCR" | site_code == "MOL" | site_code == "MUSH" | site_code == "OCC" | site_code == "PIRA" | site_code == "SCT" | site_code == "SEA" | site_code == "SHCO" | site_code == "SWC" | site_code == "TRIS")

P.ochra_SSWDtf_over25_fig <- P.ochra_SSWDtf_over25 %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(P.ochra Abundance)") 
P.ochra_SSWDtf_over25_fig <- P.ochra_SSWDtf_over25_fig + facet_wrap(~ marine_sort_order)
P.ochra_SSWDtf_over25_fig

ggsave(filename = "../../results/Exploratory_Analysis_results/P.ochra_SSWDtf_over25_fig.png",plot = P.ochra_SSWDtf_over25_fig, width = 5, height = 4) 
P.ochra_SSWDtf_highestcount_temp <- P.ochra_SSWD_tf %>% filter(marine_common_year == "2013" & total >= 40)
unique(P.ochra_SSWDtf_highestcount_temp$site_code)
## [1] DMN  END  FKC  FOG  GREN MCR  OCC  SHCO
## 77 Levels: ALEG ARG ARHO ARN BML BOA BOB BOD BRN CARP CAY CHM ... WIN
P.ochra_SSWDtf_over40 <- P.ochra_SSWD_tf %>% filter(site_code == "DMN" | site_code == "END" | site_code == "FKC" | site_code == "FOG" | site_code == "GREN" | site_code == "MCR" | site_code == "OCC" | site_code == "SHCO")

P.ochra_SSWDtf_over40_fig <- P.ochra_SSWDtf_over40 %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year)) + 
  geom_boxplot(fill = "orchid4", alpha = 0.4) +
  xlab("Year") + 
  ylab("log(P.ochra Abundance)") 
P.ochra_SSWDtf_over40_fig <- P.ochra_SSWDtf_over40_fig + facet_wrap(~ marine_sort_order, nrow =2)
P.ochra_SSWDtf_over40_fig

ggsave(filename = "../../results/Exploratory_Analysis_results/P.ochra_SSWDtf_over40_fig.png",plot = P.ochra_SSWDtf_over40_fig, width = 5, height = 4)
SSWD_timeframe %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")


Fitting a linear model

SSWD_timeframe %>% ggplot(aes(marine_common_year, log(total), col=state)) +  geom_jitter(width = 0.45, alpha = 0.25) +
  facet_wrap(~state, 1) + geom_smooth(method = "lm", col="black") + theme(legend.position = "none")

Just California

SSWD_timeframe_CA <- filter(SSWD_timeframe, state == "CA")
SSWD_timeframe_CA$marine_common_year <- as.factor(SSWD_timeframe_CA$marine_common_year)
SSWD_timeframe_CA %>% ggplot(aes(marine_common_year, log(total), col=marine_common_year)) +  geom_jitter(width = 0.5, alpha = 0.25) + geom_smooth(method = "lm", col="red") + theme(legend.position = "none")


Doing the same as above, but separating by species, with a primary focus on Pisaster ochraceus (the organism I’m studing in my research)

SSWD_timeframe_P.ochra <- filter(P.ochra_only, marine_common_year >= "2013" & marine_common_year <= "2016")

SSWD_timeframe_P.ochra$marine_common_year <- as.factor(SSWD_timeframe_P.ochra$marine_common_year)

SSWD_timeframe_P.ochra %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")

SSWD_timeframe_K.tuni <- filter(K.tuni_only, marine_common_year >= "2013" & marine_common_year <= "2016")

SSWD_timeframe_K.tuni$marine_common_year <- as.factor(SSWD_timeframe_K.tuni$marine_common_year)

SSWD_timeframe_K.tuni %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")

SSWD_timeframe_E.tros <- filter(E.tros_only, marine_common_year >= "2013" & marine_common_year <= "2016")

SSWD_timeframe_E.tros$marine_common_year <- as.factor(SSWD_timeframe_E.tros$marine_common_year)

SSWD_timeframe_E.tros %>% ggplot(aes(marine_common_year, log(total), group = marine_common_year, col=marine_common_year)) + geom_boxplot() +   facet_wrap(~state, 1) + theme(legend.position = "none")

OH! I didn’t catch that the reason why E.tros had such few survey sightings was because its range appears to only be in Alaska. Speaking of which….


I should go back and plot each survey entry and color by species observed to get an idea of the ranges of each.

Species <- SSWD_timeframe$species_code
broad_jitter <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species), size = 2, alpha = 0.3, width = 3) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-128, -115), ylim = c(32, 49.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")

narrow_jitter <- ggplot(data = world) +
    geom_sf() +
  geom_jitter(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species), size = 2, alpha = 0.3, width = 0.3) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-128, -115), ylim = c(32, 49.5), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")

gridExtra::grid.arrange(broad_jitter,narrow_jitter,nrow=2)
## Scale on map varies by more than 10%, scale bar may be inaccurate
## Scale on map varies by more than 10%, scale bar may be inaccurate

AK_zoom1_SSWD <- ggplot(data = world) +
    geom_sf() +
  geom_point(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species,), size = 2, alpha = 0.4) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-140, -130), ylim = c(55, 62), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") 
  

AK_zoom2_SSWD <- ggplot(data = world) +
    geom_sf() +
  geom_point(data= SSWD_timeframe,aes(x=SSWD_timeframe$longitude, y=SSWD_timeframe$latitude,col=Species,), size = 2, alpha = 0.4) +
  annotation_scale(location = "bl", width_hint = 0.3) +
  coord_sf(xlim = c(-136, -135), ylim = c(56.6, 57.3), expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude") 
  
grid.arrange(AK_zoom1_SSWD, AK_zoom2_SSWD, nrow = 2)
## Scale on map varies by more than 10%, scale bar may be inaccurate